require(boot) # package boot provides cv.glm(), which cross-validates a glm object

#### ------------------ simulating different errors --------------------------------
nX=20                      # number of observations
X=array(1,dim=c(nX,6))     # intercept + 5 variables
X[,-1]=rnorm(nX*5,0,1)     # predictors
beta.v=c(.5,1,-1,2,-2,0.1) # regression coefficients
sigma0=3                   # standard deviation (not variance) of the noise in the response Y

# first, estimating the prediction and training errors on a single training set
Y=rnorm(nX,mean=X%*%beta.v,sd=sigma0) # training set
X.dat=as.data.frame(X); X.dat=cbind(X.dat,Y)
Y.lm=lm(Y~V2+V3+V4+V5+V6,data=X.dat) # V1 (the all-ones column) excluded: lm() adds its own intercept
n.rep=10000
Y.eval=rnorm(nX*n.rep,mean=rep(X%*%beta.v,times=n.rep),sd=sigma0) # evaluation data
sum((Y.eval-rep(predict(Y.lm),times=n.rep))^2)/(nX*n.rep) # prediction error
sum((Y-predict(Y.lm))^2)/(nX) # training error

Y.glm=glm(Y~V2+V3+V4+V5+V6,data=X.dat) # same model refitted as a glm, so that cv.glm() can be used

# using cross-validation
cv.err <- cv.glm(X.dat,Y.glm) # leave-one-out cross-validation
cv.err.5 <- cv.glm(X.dat,Y.glm,K=5) # 5-fold cross-validation
cv.err$delta # output includes the raw estimate and a bias-corrected one
cv.err.5$delta # one estimate, based on a single partition into 5 folds

# cross-validation done manually, to illustrate the process; not as efficient as e.g. cv.glm
# first, leave-one-out
LOO.tr=numeric(nX)
for (i in 1:nX){
  Y.lm.LOO=lm(Y~V2+V3+V4+V5+V6,data=X.dat[-i,]) # fit without observation i
  LOO.tr[i]=(Y[i]-predict(Y.lm.LOO,newdata=X.dat[i,]))^2
}
summary(LOO.tr)

# second, 5-fold cross-validation
CV5.tr=numeric(5)
ind.CV=split(sample(1:nX,replace=F),f=rep(1:5,each=(nX/5))) # list of indices defining the partition into 5 folds
for (i in 1:5){
  Y.lm.cv5=lm(Y~V2+V3+V4+V5+V6,data=X.dat[-ind.CV[[i]],]) # fit without fold i
  CV5.tr[i]=sum((Y[ind.CV[[i]]]-predict(Y.lm.cv5,newdata=X.dat[ind.CV[[i]],]))^2)
}
summary(CV5.tr)

# Estimating the expected error (over different training sets)
n.sim=200 # number of training sets
train.err=pred.err=numeric(n.sim)
err.LOO=err.CV5=array(0,dim=c(n.sim,2))
for (i in 1:n.sim) {
  Y=rnorm(nX,mean=X%*%beta.v,sd=sigma0) # training set
  X.dat=as.data.frame(X); X.dat=cbind(X.dat,Y)
  Y.lm=lm(Y~V2+V3+V4+V5+V6,data=X.dat)
  Y.eval=rnorm(nX*n.rep,mean=rep(X%*%beta.v,times=n.rep),sd=sigma0) # evaluation data
  pred.err[i]=sum((Y.eval-rep(predict(Y.lm),times=n.rep))^2)/(nX*n.rep) # prediction error
  train.err[i]=sum((Y-predict(Y.lm))^2)/(nX)
  Y.glm=glm(Y~V2+V3+V4+V5+V6,data=X.dat)
  cv.err <- cv.glm(X.dat,Y.glm) # leave-one-out cross-validation
  cv.err.5 <- cv.glm(X.dat,Y.glm,K=5) # 5-fold cross-validation
  err.LOO[i,]=cv.err$delta
  err.CV5[i,]=cv.err.5$delta
}
summary(pred.err)
summary(train.err)
summary(err.LOO) # columns: raw and bias-corrected LOO estimates
summary(err.CV5)
sigma0^2+(6/nX)*sigma0^2 # theoretical value of the expected prediction error (e.g. Hastie et al. 2009, p. 224)
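
# --- Added check (a small sketch, not in the original script): compare the simulated
# averages to theory. For an unbiased linear model with p=6 parameters,
# E[prediction error] = sigma^2 + (p/nX)*sigma^2 and E[training error] = sigma^2 - (p/nX)*sigma^2;
# the difference, 2*(p/nX)*sigma^2, is the optimism of the training error (Hastie et al. 2009, Ch. 7)
mean(pred.err) # should be close to sigma0^2 + (6/nX)*sigma0^2 above
mean(train.err) # should be close to the theoretical value below
sigma0^2-(6/nX)*sigma0^2 # theoretical expected training error
mean(pred.err)-mean(train.err) # simulated optimism, roughly 2*(6/nX)*sigma0^2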

# same calculations, but using models simpler than the true model generating the data
# a model simpler than the truth can predict better, because its sampling variance is small compared to the true model's
# we consider two alternatives: ignoring the important covariates V4 and V5, which is too simple and leads to large biases,
# and ignoring only the unimportant V6, where a small bias is compensated by lower variability (and therefore lower MSE)
Y=rnorm(nX,mean=X%*%beta.v,sd=sigma0) # training set
X.dat=as.data.frame(X); X.dat=cbind(X.dat,Y)
Y.lm.s1=lm(Y~V2+V3+V4+V5,data=X.dat) # first "simple" model: V6 removed
Y.lm.s2=lm(Y~V2+V3,data=X.dat) # second "simple" model: V4, V5 and V6 removed
n.rep=10000
Y.eval=rnorm(nX*n.rep,mean=rep(X%*%beta.v,times=n.rep),sd=sigma0) # evaluation data to calculate the error
sum((Y.eval-rep(predict(Y.lm.s1),times=n.rep))^2)/(nX*n.rep) # prediction error for model s1
sum((Y.eval-rep(predict(Y.lm.s2),times=n.rep))^2)/(nX*n.rep) # prediction error for model s2
sum((Y-predict(Y.lm.s1))^2)/(nX) # training error of s1, calculated on the sample (optimistic, i.e. too low)
sum((Y-predict(Y.lm.s2))^2)/(nX) # training error of s2

# Calculating predicted values over n.sim training sets for the three models (full = true model, plus the two simple models s1 and s2)
# bias is obtained by comparing average predictions to the true means,
# and variance is the variance of the predictions; the overall measure is MSE = bias^2 + variance
n.sim=100 # number of training sets
pred.tr=pred.tr1=pred.tr2=array(0,dim=c(n.sim,nX))
for (i in 1:n.sim) {
  Y=rnorm(nX,mean=X%*%beta.v,sd=sigma0) # training set
  X.dat=as.data.frame(X); X.dat=cbind(X.dat,Y)
  Y.lm=lm(Y~V2+V3+V4+V5+V6,data=X.dat)
  Y.lm.s1=lm(Y~V2+V3+V4+V5,data=X.dat)
  Y.lm.s2=lm(Y~V2+V3,data=X.dat)
  pred.tr[i,]=predict(Y.lm)
  pred.tr1[i,]=predict(Y.lm.s1)
  pred.tr2[i,]=predict(Y.lm.s2)
}
apply(pred.tr,MAR=2,mean) # average predictions
as.vector(X%*%beta.v) # true means of the predictions
apply(pred.tr,MAR=2,mean) - as.vector(X%*%beta.v) # bias for the true model, theoretically 0
apply(pred.tr1,MAR=2,mean) - as.vector(X%*%beta.v) # bias for the simple model removing the unimportant variable
apply(pred.tr2,MAR=2,mean) - as.vector(X%*%beta.v) # bias for the simple model removing two important variables and the unimportant one
apply(pred.tr,MAR=2,var) # variance for the true model, theoretically given by x0 and the H matrix, see e.g. Hastie et al. 2009
apply(pred.tr1,MAR=2,var) # variance for the simple model removing the unimportant variable
apply(pred.tr2,MAR=2,var) # variance for the simple model removing two important variables and the unimportant one
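
# --- Added check (a small sketch, not in the original script): verify numerically that
# MSE = bias^2 + variance for each predicted value of, say, model s2; var() uses the
# n-1 denominator, so it is rescaled by (n.sim-1)/n.sim to make the identity exact.
# The helper names below (mu.true, bias2.s2, ...) are illustrative
mu.true=as.vector(X%*%beta.v)
bias2.s2=(apply(pred.tr2,MAR=2,mean) - mu.true)^2
var.s2=apply(pred.tr2,MAR=2,var)*(n.sim-1)/n.sim
mse.s2=apply(sweep(pred.tr2,MAR=2,FUN="-",STATS=mu.true),MAR=2,FUN=function(x) sum(x^2)/length(x))
summary(mse.s2 - (bias2.s2 + var.s2)) # should be zero up to floating-point rounding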

par(mfrow=c(3,1)) # plotting the squared bias, variance and MSE for the 20 predicted values (i.e. the 20 sample units)
plot(1:20,(apply(pred.tr2,MAR=2,mean) - as.vector(X%*%beta.v))^2,pch=19) # s2 (black): strongly biased but precise (low variance)
points(1:20,(apply(pred.tr1,MAR=2,mean) - as.vector(X%*%beta.v))^2,col="blue",pch=19) # s1 (blue): almost unbiased
points(1:20+.1,(apply(pred.tr,MAR=2,mean) - as.vector(X%*%beta.v))^2,col="green",pch=19) # full model (green): unbiased but relatively imprecise
abline(0,0)
title(main="BIAS SQUARED")
plot(1:20,apply(pred.tr1,MAR=2,var),col="blue",pch=19,ylim=c(0,10)) # s1 (blue)
points(1:20+.1,apply(pred.tr2,MAR=2,var),pch=19) # s2 (black)
points(1:20+.2,apply(pred.tr,MAR=2,var),col="green",pch=19) # full model (green)
title(main="VARIANCE")
sumsq=function(x) sum(x^2)/length(x) # mean of squares, used to compute the MSE directly
plot(1:20,apply(sweep(pred.tr2,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq),pch=19)
points(1:20,apply(sweep(pred.tr1,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq),pch=19,col="blue")
points(1:20,apply(sweep(pred.tr,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq),pch=19,col="green")
title(main="MSE")
sum(apply(sweep(pred.tr,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq)) # total MSE, full model
sum(apply(sweep(pred.tr1,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq)) # total MSE, model s1
sum(apply(sweep(pred.tr2,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq)) # total MSE, model s2
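
# --- Added illustration (a sketch, not in the original script): in practice the true
# means are unknown, but the cross-validation of the first part estimates the prediction
# error from the data alone, so it can be used to choose among the three models.
# The ranking is noisy on a single training set; the object names below are illustrative
Y=rnorm(nX,mean=X%*%beta.v,sd=sigma0) # a fresh training set
X.dat=as.data.frame(X); X.dat=cbind(X.dat,Y)
Y.glm.full=glm(Y~V2+V3+V4+V5+V6,data=X.dat)
Y.glm.s1=glm(Y~V2+V3+V4+V5,data=X.dat)
Y.glm.s2=glm(Y~V2+V3,data=X.dat)
cv.full=cv.glm(X.dat,Y.glm.full)$delta[1] # LOO CV error, full model
cv.s1=cv.glm(X.dat,Y.glm.s1)$delta[1] # LOO CV error, model s1
cv.s2=cv.glm(X.dat,Y.glm.s2)$delta[1] # LOO CV error, model s2
c(full=cv.full,s1=cv.s1,s2=cv.s2) # the smallest value indicates the model expected to predict best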